import json
import os

import pandas as pd

# Script: find captions that occur more than once in the prediction CSV and
# write each such caption, together with all of its sources, to a JSON file.

# Input/output locations, named once so the final message always matches
# the file that was actually written.
INPUT_CSV = "prediction_result/res2.csv"
OUTPUT_JSON = "user_data/json_data/group_by_capt.json"

# Load the CSV; it must provide "Caption" and "Source" columns.
df = pd.read_csv(INPUT_CSV)

# Collect every Source value per Caption into a list.
# NOTE: with pandas' default groupby settings, rows whose Caption is NaN are
# dropped and the groups come out sorted by Caption.
grouped = df.groupby("Caption")["Source"].apply(list).reset_index()

# Keep only duplicated captions, i.e. those with more than one source.
duplicates = grouped[grouped["Source"].apply(len) > 1]

# Convert to a list of plain dicts so it can be serialized with json.
result = [
    {"Caption": caption, "Sources": sources}
    for caption, sources in zip(duplicates["Caption"], duplicates["Source"])
]

# Make sure the destination directory exists; otherwise open() would fail
# with FileNotFoundError after all the work above has already been done.
output_dir = os.path.dirname(OUTPUT_JSON)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save as JSON; ensure_ascii=False keeps non-ASCII captions human-readable.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

# Report the real output path (the old message named a non-existent
# "output.json").
print(f"已保存到 {OUTPUT_JSON}")
